# -*- coding:utf8 -*-
# !/usr/bin/env python

'''
Scraper for the National Enterprise Credit Information Publicity System (Tianjin).
Maintainer: 黄羽 (Huang Yu)
'''

import re
from bs4 import BeautifulSoup
import traceback
import requests
import json
import copy
import sys

from scpy.logger import get_logger
from scpy.xawesome_time import parse_time

from utils import kill_captcha
import table
import sd_template_dict as TE

# Python 2 idiom: reload ``sys`` and then set the interpreter-wide default
# string encoding to UTF-8, so implicit str/unicode conversions of the
# Chinese text handled in this module use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Module-level logger shared by every function in this file.
logger = get_logger(__file__)


def get_url(word, items):
    """Return the ``url`` of the first entry in *items* whose ``name`` equals *word*.

    Entries that lack either a ``name`` or a ``url`` key are ignored.  An
    empty string is returned when no entry matches.
    """
    candidates = (
        entry["url"]
        for entry in items
        if "name" in entry and "url" in entry and word == entry["name"]
    )
    return next(candidates, '')


# User-Agent header value sent with the requests this module makes to tjcredit.gov.cn.
ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36"


def download_captcha_kill(companyName):
    """
    Download a captcha image, solve it, then search the site for a company.

    The captcha download and the search POST share one HTTP session; the
    session is closed before the function returns.

    :param companyName: company name submitted as the search keyword
    :return: the first ``/platform/saic/viewBase...`` link found in the
             search result page when the company is found;
             ``None`` when the result page contains no such link;
             ``''`` when the captcha image is empty, the solver output is
             unusable, or the site rejects the code (the caller is expected
             to retry)
    :raises Exception: any error raised while downloading the captcha or by
             the captcha-solving service is logged and re-raised unchanged
    """
    img_url = r'http://tjcredit.gov.cn/verifycode?date=1444705199335'
    img_headers = {
        'Accept': 'image/webp,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'tjcredit.gov.cn',
        'Referer': 'http://tjcredit.gov.cn/platform/saic/index.ftl',
        'User-Agent': ua,
    }
    check_url = 'http://tjcredit.gov.cn/platform/saic/search.ftl'

    # The context manager guarantees the session (and its pooled
    # connections) is released on every exit path.
    with requests.session() as req:
        req.headers = img_headers

        # Download the captcha image.
        try:
            img = req.get(img_url, timeout=100).content
        except Exception as e:
            logger.error(e)
            raise  # bare raise keeps the original traceback
        if not img:  # nothing was downloaded: let the caller retry
            return ''

        # Hand the image to the captcha-solving service.
        try:
            res_code = kill_captcha(img, "tj", "jpg")
        except Exception as e:
            logger.error("破解验证码的服务出现异常")
            logger.error(e)
            raise

        logger.info('验证码为:%s' % res_code)
        if not res_code or len(res_code) > 100 or str(res_code) in ['None', 'wrong']:
            logger.error("破解验证码的服务出现异常,可能是下载的验证码错误，也可能破解服务出现异常！")
            return ''  # empty string tells the caller to try again

        # Submit the search in the same session that fetched the captcha.
        check_data = {'matchCondition': '1', 'searchContent': companyName, 'vcode': res_code, }
        check_res = req.post(check_url, data=check_data, timeout=100).content

    check_res_str = str(BeautifulSoup(check_res, 'html5lib'))

    # A rejected captcha is detected by this marker text in the response.
    # NOTE(review): the original comment quoted the site's message as
    # '验证码不正确或已失效！' while the code matches '验证码错误' -- confirm which
    # text the site actually returns.
    if re.search('验证码错误', check_res_str):
        return ''  # empty string tells the caller to try again

    com_list = re.findall('''<a href="(/platform/saic/viewBase.*?)"''', check_res_str)
    if com_list:
        logger.info("搜索的公司存在!")
        return com_list[0]

    logger.info("搜索的公司不存在!")
    return None


_REQUEST_TIMEOUT = 100  # seconds, applied to every HTTP request issued by get_company_info


def _fetch_named_page(session, root_url, name, sections):
    """Download the page listed under *name* in *sections*; return "" when it is not listed."""
    relative_url = get_url(name, sections)
    if not relative_url:
        return ""
    return session.get(root_url + relative_url, timeout=_REQUEST_TIMEOUT).content


def _fetch_share_details(session, base_html):
    """Download the detail page for each shareholder row of the ``touziren`` table in *base_html*."""
    share_detail_url = 'http://tjcredit.gov.cn/saicpf/gsgdcz'
    details = []
    tables = BeautifulSoup(base_html, "html5lib").find_all("table", attrs={"id": "touziren"})
    if not tables:
        return details

    rows = re.findall("<tr>.*?</tr>", str(tables[0]), re.S)
    # The first matched row is skipped, exactly as before.
    for row in rows[1:]:
        params = re.findall(r'''getShareHolder\('(.*?)','(.*?)'\)"''', row)
        if not params or len(params[0]) <= 1:
            # Keep a placeholder so the list still has one entry per row.
            details.append("")
            continue
        query = {
            'gdczid': params[0][0],
            'entid': params[0][1],
            'issaic': '1',
            'hasInfo': '0',
        }
        details.append(session.get(share_detail_url, params=query, timeout=_REQUEST_TIMEOUT).content)
    return details


def _fetch_year_reports(session, root_url, ent_id):
    """Download the annual-report page for every year listed on the company's report index."""
    # Per the original author's note: inside the annual report the
    # shareholder, equity-change and IP-contribution sections carry no data
    # and clicking their buttons sends no request, so only the base page of
    # each year is downloaded.
    nblist_url = root_url + '/report/nblist?entid=' + ent_id
    raw_year_index = session.get(nblist_url, timeout=_REQUEST_TIMEOUT).content
    year_params_list = re.findall(r'''onclick="nbdetail\('(.*?)','(.*?)'\)"''', raw_year_index)

    year_report_url = 'http://tjcredit.gov.cn/report/annals'
    reports = []
    for item in year_params_list:
        if not item or len(item) <= 1 or not item[1]:
            continue
        year = item[1]
        raw_year_base_res = session.get(
            year_report_url,
            params={"entid": ent_id, "year": year, "hasInfo": "0"},
            timeout=_REQUEST_TIMEOUT,
        ).content
        reports.append({"year": year, "year_base": raw_year_base_res})
    return reports


def get_company_info(com_info):
    """
    Download the raw pages of one company, including its annual reports.

    :param com_info: relative link of the company page as returned by the
                     search step; it must contain ``entId=<id>``
    :return: dict with the keys ``province``, ``type``, ``html``,
             ``yearList``, ``keyword``, ``companyName`` and ``json``.
             ``html`` maps ``base``, ``share_detail``, ``beian``,
             ``abnormal`` and ``check_message`` to the downloaded content;
             ``yearList`` holds one ``{"year", "year_base"}`` dict per
             annual report.
    :raises Exception: when *com_info* is empty or carries no ``entId``;
             errors of the first (section index) request are logged and
             re-raised, other network errors propagate as raised.
    """
    if not com_info:
        raise Exception("com_list 错误")

    root_url = 'http://tjcredit.gov.cn'
    entId = re.compile(r'entId=(.+)').findall(com_info)
    if not entId:
        raise Exception("entId 错误")
    ent_id = entId[0]
    index_url = root_url + com_info

    raw_dict = {
        "province": "tj",
        "type": "1",
        "html": "",
        "yearList": [],
        "keyword": "",
        "companyName": "",
        "json": "",
    }
    raw_base_dict = {}

    # The context manager closes the session on every exit path.
    with requests.session() as req:
        req.headers = {
            'Origin': 'http://tjcredit.gov.cn',
            'Host': 'tjcredit.gov.cn',
            'Referer': index_url,
            'User-Agent': ua,
        }

        # JSON index that lists the company's sections and their URLs.
        com_str = root_url + '/platform/saic/topInfoClass.json?departmentId=scjgw&entId=' + ent_id
        try:
            logger.info("正在下载基本信息网页！")
            gate_json_res = req.get(com_str, timeout=_REQUEST_TIMEOUT).content
        except Exception as e:
            logger.error(e)
            raise  # bare raise keeps the original traceback
        gate_json_res = json.loads(gate_json_res)

        # Registration info.  Unlike the optional sections below, this page
        # is requested even when the index does not list it.
        dj = get_url('登记信息', gate_json_res)
        dj_res = req.get(root_url + dj, timeout=_REQUEST_TIMEOUT).content
        raw_base_dict["base"] = dj_res

        # Shareholder detail pages.
        raw_base_dict["share_detail"] = _fetch_share_details(req, dj_res)

        # Optional sections: filing info, abnormal operation, spot checks.
        raw_base_dict["beian"] = _fetch_named_page(req, root_url, '备案信息', gate_json_res)
        raw_base_dict["abnormal"] = _fetch_named_page(req, root_url, '经营异常信息', gate_json_res)
        raw_base_dict["check_message"] = _fetch_named_page(req, root_url, '抽查检查信息', gate_json_res)

        raw_dict["html"] = raw_base_dict

        # Annual reports.
        raw_dict["yearList"] = _fetch_year_reports(req, root_url, ent_id)

    return raw_dict


def _parse_share_detail(detail_html):
    """Build a shareholder dict from one detail page, or return None when the expected rows are missing."""
    detail_table = table.table_clean(detail_html, "股东及出资信息")
    if not detail_table:
        return None
    # Whitespace is removed first so that each row matches on a single line.
    trs = re.findall("<tr.*?</tr>", re.sub(r"\s", "", detail_table))
    if len(trs) < 5:
        return None
    tds_1 = re.findall("<td.*?>(.*?)</td>", trs[3])
    tds_2 = re.findall("<td.*?>(.*?)</td>", trs[4])
    if len(tds_1) < 3 or len(tds_2) < 6:
        return None
    return {
        'shareholderName': tds_1[0],
        'shareholderType': '',  # shareholder type, filled in from the main list
        'country': '',  # country
        'subConam': tds_1[1],  # subscribed capital (unit: 10k yuan)
        'regCapCur': '',  # currency
        'conDate': parse_time(tds_2[1]),  # contribution date
        'fundedRatio': '',  # contribution ratio
    }


def extract_base_info(raw_dict):
    """
    Parse the raw pages downloaded for a company into the structured base dict.

    :param raw_dict: dict whose ``html`` entry maps ``base``,
                     ``share_detail``, ``beian``, ``abnormal`` and
                     ``check_message`` to raw page content
    :return: ``None`` when *raw_dict* is empty; otherwise a deep copy of
             ``TE.void_base_dict`` filled with the parsed sections
    :raises Exception: when ``html`` or its ``base`` page is missing/empty,
             or when the basic-information table cannot be located
    """
    if not raw_dict:
        return None
    raw_html = raw_dict.get("html", {})
    if not raw_html:
        raise Exception("raw_dict 错误")
    raw_base = raw_html.get("base", "")
    if not raw_base:
        raise Exception("raw_dict 错误")

    res_base_dict = copy.deepcopy(TE.void_base_dict)

    # Basic information (mandatory).
    raw_base_html = table.table_clean(raw_base, "基本信息")
    if not raw_base_html:
        raise Exception("基本信息")
    res_base_dict["basicList"] = table.index("基本信息", raw_base_html)
    res_base_dict["province"] = "tj"

    # Shareholder list from the main page.
    raw_share_html = table.table_clean(raw_base, "股东信息") or table.table_clean(raw_base, "股东（发起人）信息")
    share_holder_list_1 = table.index("股东信息", raw_share_html) if raw_share_html else []

    # Merge each shareholder detail page into the main-page list: an entry
    # with the same (non-empty) name is replaced by the detail dict, which
    # inherits the entry's shareholder type.
    for a_detail in raw_html.get("share_detail", []):
        if not a_detail:
            continue
        shareHolder_dict = _parse_share_detail(a_detail)
        if shareHolder_dict is None:
            continue
        holder_name = shareHolder_dict['shareholderName']
        if not holder_name:
            continue
        for position, listed in enumerate(share_holder_list_1):
            if listed.get("shareholderName", "") == holder_name:
                shareHolder_dict["shareholderType"] = listed.get("shareholderType", "")
                share_holder_list_1[position] = shareHolder_dict
    res_base_dict["shareHolderList"] = share_holder_list_1

    # Change records.
    raw_alter_html = table.table_clean(raw_base, "变更信息")
    res_base_dict["alterList"] = table.index("变更信息", raw_alter_html) if raw_alter_html else []

    # Sections from the filing page; the copied defaults are left untouched
    # when that page was not downloaded.
    raw_beian = raw_html.get("beian", "")
    if raw_beian:
        # Key personnel.
        raw_person_html = table.table_clean(raw_beian, "主要人员信息") or table.table_clean(raw_beian, "家庭成员信息")
        res_base_dict["personList"] = table.index("主要人员信息", raw_person_html) if raw_person_html else []

        # Branches.
        raw_branch_html = table.table_clean(raw_beian, "分支机构信息")
        res_base_dict["filiationList"] = table.index("分支机构信息", raw_branch_html) if raw_branch_html else []

        # Liquidation.
        raw_liquidation_html = table.table_clean(raw_beian, "清算信息")
        res_base_dict["liquidationList"] = table.index("清算信息", raw_liquidation_html) if raw_liquidation_html else []

    # Abnormal-operation records (the raw page is passed to the parser as is).
    raw_abnormal_html = raw_html.get("abnormal", "")
    res_base_dict["abnormalOperation"] = table.index("经营异常信息", raw_abnormal_html) if raw_abnormal_html else []

    # Spot-check records (the raw page is passed to the parser as is).
    raw_check_message_html = raw_html.get("check_message", "")
    res_base_dict["checkMessage"] = table.index("抽查检查信息", raw_check_message_html) if raw_check_message_html else []

    return res_base_dict


def _year_section(raw_year_base, titles, index_name, default, tolerate_errors=False):
    """Parse one annual-report section, trying each title in *titles* until a table is found.

    Returns *default* when no table is found.  With ``tolerate_errors`` a
    parser failure is logged and *default* is returned instead of raising.
    """
    section_html = None
    for title in titles:
        section_html = table.table_clean(raw_year_base, title)
        if section_html:
            break
    if not section_html:
        return default
    if not tolerate_errors:
        return table.report_index(index_name, section_html)
    try:
        return table.report_index(index_name, section_html)
    except Exception as e:
        # Best effort: these sections are optional, but the failure is no
        # longer swallowed silently.
        logger.error("failed to parse annual report section %s: %s" % (index_name, e))
        return default


def extract_year_info(raw_dict):
    """
    Parse the annual-report pages stored under ``yearList`` of *raw_dict*.

    :param raw_dict: dict whose ``yearList`` entry is a list of
                     ``{"year", "year_base"}`` dicts
    :return: ``None`` when *raw_dict* is empty, ``[]`` when it has no
             annual reports, otherwise one dict per report with the keys
             ``baseInfo``, ``year``, ``website``, ``investorInformations``,
             ``assetsInfo``, ``equityChangeInformations``,
             ``changeRecords`` and ``entinvItemList``
    """
    if not raw_dict:
        return None
    raw_year_json_list = raw_dict.get("yearList", [])
    if not raw_year_json_list:
        return []

    res_year_list = []
    for a_year_item in raw_year_json_list:
        year = a_year_item.get("year", "")
        raw_year_base = a_year_item.get("year_base", "")
        res_year_dict = {}

        # Company basic information.
        res_year_dict['baseInfo'] = _year_section(
            raw_year_base, ['企业基本信息', "基本信息"], '企业基本信息', {})
        res_year_dict['year'] = year

        # Websites / online shops.
        res_year_dict['website'] = _year_section(
            raw_year_base, ['网站或网店信息'], '网站或网店信息', {})

        # Promoters / shareholders and their contributions (parser errors tolerated).
        res_year_dict['investorInformations'] = _year_section(
            raw_year_base, ['发起人及出资信息', '股东及出资信息'], '股东及出资信息', [],
            tolerate_errors=True)

        # Asset status.
        res_year_dict['assetsInfo'] = _year_section(
            raw_year_base, ['企业资产状况信息'], '企业资产状况信息', {})

        # Equity changes (parser errors tolerated).
        res_year_dict['equityChangeInformations'] = _year_section(
            raw_year_base, ['股权变更信息'], '股权变更信息', [], tolerate_errors=True)

        # Modification records.
        res_year_dict['changeRecords'] = _year_section(
            raw_year_base, ['修改记录'], '修改记录', [])

        # Outbound investments (parser errors tolerated).
        res_year_dict['entinvItemList'] = _year_section(
            raw_year_base, ['对外投资信息'], '对外投资信息', [], tolerate_errors=True)

        res_year_list.append(res_year_dict)

    return res_year_list


def _collect_company(com_list, companyName):
    """Download and parse the company behind *com_list*; return ``(raw_dict, asic_dict, gate_method)``.

    Download errors propagate.  A parsing failure is logged and reported as
    ``asic_dict = None`` so the caller still receives the raw pages.
    """
    raw_dict = get_company_info(com_list)
    try:
        asic_dict = extract_base_info(raw_dict)
        year_list = extract_year_info(raw_dict)
        # Prefer the name parsed from the page; fall back to the searched one.
        company_name = asic_dict['basicList'][0].get('enterpriseName', '') or companyName
        asic_dict['yearReportList'] = year_list
    except Exception:
        logger.error(traceback.format_exc())
        asic_dict = None
        company_name = companyName

    raw_dict['companyName'] = company_name
    gate_method = {
        'url': 'http://tjcredit.gov.cn',
        'method': 'post',
        'province': 'tj',
        'companyName': company_name,
        'data': com_list,
    }
    return raw_dict, asic_dict, gate_method


def search2(companyName, MAXTIME=40):
    """
    Search a company by name, retrying the captcha up to *MAXTIME* times.

    :param companyName: company name to search for
    :param MAXTIME: maximum number of captcha attempts
    :return: ``None`` when the company does not exist; otherwise a tuple
             ``(raw_dict, asic_dict, gate_method)`` where ``asic_dict`` is
             ``None`` if the downloaded pages could not be parsed
    :raises Exception: when every captcha attempt fails, or when the
             captcha / download step raises
    """
    res = ''
    remaining = MAXTIME
    # '' means "captcha rejected, try again"; anything else ends the loop.
    while remaining > 0 and res == '':
        if remaining < MAXTIME:
            logger.error("重复破解验证码!当前设定重复破解次数为:%s, 剩余次数为:%s " % (MAXTIME, remaining))
        remaining -= 1
        try:
            res = download_captcha_kill(companyName)
        except Exception:
            traceback.print_exc()
            raise  # bare raise keeps the original traceback

    # The outcome is checked after the loop so that the result of the very
    # last attempt is honoured as well.
    if res is None:  # company does not exist
        return None
    if res == '':  # attempts exhausted without a valid captcha
        raise Exception("多次破解验证码错误,当前设置次数为：%s" % MAXTIME)

    return _collect_company(res, companyName)


def search(companyName):
    """Return only the parsed dict for *companyName*, or ``None`` when the company is not found or parsing failed."""
    result = search2(companyName)
    if result is None:
        return None
    return result[1]


def search3(gate_method):
    """
    Re-download and re-parse a company from a previously returned ``gate_method``.

    :param gate_method: dict carrying the company link under ``data`` and
                        optionally the name under ``companyName``
    :return: tuple ``(raw_dict, asic_dict, gate_method)`` with the same
             meaning as in :func:`search2`
    :raises Exception: when *gate_method* has no ``data`` key, or when the
             download step raises
    """
    if 'data' not in gate_method:
        raise Exception("gate_method error, doesn't have `data` key")
    return _collect_company(gate_method.get('data'), gate_method.get('companyName', ''))


if __name__ == "__main__":
    # Manual smoke run: search one company and dump the result as JSON.
    # The commented-out names below are alternative sample inputs.
    # companyName = u'天津渤海化工集团有限责任公司'
    # companyName = u'天津娃哈哈宏振饮料有限公司'
    # companyName = u'天津正和金属有限公司'
    # companyName = u'天津嘉美投资发展有限公司'
    # companyName = u'四川长虹电器股份有限公司天津销售分公司'
    # companyName = u'威实（天津）食品开发有限公司'
    # companyName = u'狮迈（上海）贸易有限公司天津分公司'
    # companyName = u'利胜地中海航运（上海）有限公司天津分公司'
    # companyName = u'天津中原物业顾问有限公司'
    # companyName = u'天津市和平区雅思培训中心'
    # companyName = u'天津市红桥区立嘉行房屋信息咨询中心'
    # companyName = u'天津市华泰动力网络科技有限公司'
    # companyName = u'顺驰（中国）不动产网络集团有限公司'
    # companyName = u'天津滨海新区财富资产管理有限公司'
    # companyName = u'天津南环铁路有限公司'
    # companyName = u'天津众鑫房地产咨询服务有限公司'
    # companyName = u'天津市鼎大模具有限公司'
    # companyName = u'香格里拉国际饭店管理有限公司'
    # companyName = u'天津市春华校园体育设施有限公司'
    # companyName = u'天津市海益嘉国际贸易有限公司'
    # companyName = u'天津丽茂源松节能技术有限公司'
    # companyName = u'天津优福科技有限公司'
    # Samples with abnormal-operation records
    # companyName = u'天津市宝坻区瑞达快餐店'
    # companyName = u'天津市江波商务信息咨询有限公司'

    # Samples with spot-check records
    # companyName = u'天津南环铁路有限公司'
    companyName = u'春秋时代（天津）影业有限公司'

    # province = 'tj'
    # result = search(companyName)
    result = search2(companyName)

    # ``json`` is already imported at module level; the parenthesised
    # single-argument form prints the same text on Python 2 and 3.
    print(json.dumps(result, indent=4, ensure_ascii=False))
